"""@author: sarjanshrestha"""

import collections
import numpy as np
import itertools
import sys


def is_number(s):
    """Return True if *s* can be converted to a float, False otherwise.

    Args:
        s: Value to test, typically one string field of a CSV row.

    Returns:
        bool: True when ``float(s)`` succeeds.  Anything ``float`` accepts
        counts as a number, including strings such as ``"nan"`` or ``"inf"``.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number (e.g. "tcp" or "").
        # TypeError: an object float() cannot handle at all (e.g. None);
        # report it as "not a number" instead of letting it propagate.
        return False
    return True
    
def continuous_Prob(x,m,s):
    """Return the Gaussian (normal) probability density of *x*.

    Computes ``exp(-(x - m)**2 / (2 * s**2)) / sqrt(2 * pi * s**2)``.

    Args:
        x: Value (scalar or NumPy array) at which to evaluate the density.
        m: Mean of the distribution.
        s: Standard deviation of the distribution.  Must be non-zero,
            otherwise the expression divides by zero.

    Returns:
        The density at *x*, as a NumPy float (or array when *x* is an array).
    """
    variance = np.square(s)
    # The exponent is divided by the whole of 2*s^2 (the parentheses matter:
    # `a / 2 * v` would multiply by the variance), and the exponential is
    # kept as a real number rather than being rounded to 0 or 1.
    exponent = -np.square(x - m) / (2.0 * variance)
    return np.exp(exponent) / np.sqrt(2.0 * np.pi * variance)




# print 'training File name is : ',sys.argv[1]
# 
# print 'test File name is : ',sys.argv[2]
# print 'training number  is : ',sys.argv[3]

# Training file.  Raw string: the same path as before, but it no longer
# relies on "\P" and "\k" happening not to be escape sequences.
TRAIN_PATH = r'C:\Python27\kddcup.data_10_percent.txt'

# Upper bound on the number of training rows.  The earlier loop tested
# `lineTrain > 10000` before incrementing, so it admitted 10001 rows; the
# same count is kept here so the trained model does not change.
MAX_TRAIN_ROWS = 10001

# Module-level state used by the rest of the script.
columns=collections.defaultdict(int)
probClass=collections.defaultdict(int)
jointCount=collections.defaultdict(int)
jointProbClass=collections.defaultdict(int)
classAttribute=[]
predictCounts=collections.defaultdict(int)
predictClass=collections.defaultdict(int)
predictResult={}
data=[]                 # training rows, each a list of string fields
coln=[]
mu=[]
sd=[]
maxm=[]

# `with` guarantees the file is closed even if reading fails part-way.
with open(TRAIN_PATH, 'r') as f:
    stripped_lines = (raw.strip() for raw in f)
    # Skip blank lines so that e.g. a trailing empty line cannot turn into
    # a bogus one-field row.
    non_blank_lines = (text for text in stripped_lines if text)
    for line in itertools.islice(non_blank_lines, MAX_TRAIN_ROWS):
        data.append(line.split(','))

lineTrain = len(data)                 # number of training rows actually read
vL = len(data[0]) if data else 0      # number of columns in the table
    
# Scan the training rows once to collect, in order of first appearance:
#   coln           - indices of columns that hold a numeric value in at
#                    least one row
#   classAttribute - the distinct class labels (last field of each row)
numeric_seen = set(coln)              # O(1) membership instead of list scans
labels_seen = set(classAttribute)

for rows in data:
    for counter, field in enumerate(rows):
        # Cheap set lookup first; float parsing only for unknown columns.
        if counter not in numeric_seen and is_number(field):
            numeric_seen.add(counter)
            coln.append(counter)

    # The label depends only on the row, so it is examined once per row
    # rather than once per column.
    label = rows[-1]
    if label not in labels_seen:
        labels_seen.add(label)
        classAttribute.append(label)


def _numeric_matrix(rows, columns):
    """Return a (len(rows), len(columns)) float array of the chosen columns."""
    matrix = np.zeros((len(rows), len(columns)))
    for r, row in enumerate(rows):
        for c, column in enumerate(columns):
            matrix[r, c] = float(row[column])
    return matrix


def _nudge_zero_sum_columns(matrix):
    """Set row 1 to 1 in every column whose values sum to zero (in place).

    This keeps such a column from having a standard deviation of exactly
    zero.  Matrices with fewer than two rows have no row 1 and are left
    untouched instead of raising IndexError.
    """
    if matrix.shape[0] < 2:
        return
    zero_sum = np.sum(matrix, axis=0) == 0
    matrix[1, zero_sum] = 1


def _column_mean_std(matrix):
    """Return (mean, std) per column; all zeros when the matrix has no rows."""
    if matrix.shape[0] == 0:
        return np.zeros(matrix.shape[1]), np.zeros(matrix.shape[1])
    return np.mean(matrix, axis=0), np.std(matrix, axis=0)


# Split the training rows by class: 'normal.' versus everything else.
data1 = [row for row in data if row[-1] == 'normal.']
data2 = [row for row in data if row[-1] != 'normal.']

nCount=len(data1)                   #total number of normal event counts
aCount=len(data2)                   #total number of anomaly event counts

probN=np.float64(nCount)/np.float64(len(data))     #probability of normal events
probA=np.float64(aCount)/np.float64(len(data))     #probability of anomaly events

arrN=np.array(data1)
arrA=np.array(data2)

# Numeric columns (those listed in coln) of each class as float matrices.
arrN1=_numeric_matrix(data1, coln)
_nudge_zero_sum_columns(arrN1)

arrA1=_numeric_matrix(data2, coln)
_nudge_zero_sum_columns(arrA1)

# Per-column mean and standard deviation for each class.
# NOTE: a column that is constant but non-zero still has std == 0 here.
muN, stdN = _column_mean_std(arrN1)
muA, stdA = _column_mean_std(arrA1)


"""Train the bayes_network"""            
# Count, in a single pass over the training rows:
#   predictCounts[value]          - occurrences of each non-numeric value
#   predictCounts['normal.']      - number of rows labelled 'normal.'
#   predictCounts['anomaly.']     - number of rows with any other label
#   jointCount[value, event]      - occurrences of a non-numeric value in
#                                   rows of the given event class
# NOTE(review): values are keyed by their text only, not by column, so the
# same text appearing in two different columns shares one counter.
for rows in data:
    event = 'normal.' if rows[-1]=='normal.' else 'anomaly.'
    predictCounts[event]+=1
    for field in rows[:-1]:          # the last field is the label itself
        if is_number(field):
            continue
        predictCounts[field]+=1
        jointCount[field, event]+=1


# Prior probability of each event class.  float() forces true division on
# Python 2 as well.
probNE=float(predictCounts['normal.'])/len(data)    #probability of normal events
probAE=float(predictCounts['anomaly.'])/len(data)   #probability of anomaly events


# Conditional probability of each non-numeric value given its event class.
for key in jointCount:
    if key[1]=='normal.':
        class_total = predictCounts['normal.']
    else:
        class_total = predictCounts['anomaly.']
    jointProbClass[key]=float(jointCount[key])/class_total
     
    
 
"""Classify the given instance"""
 
# Test file.  Raw string: same path as before, without relying on "\P" and
# "\c" happening not to be escape sequences.
TEST_PATH = r'C:\Python27\corrected'

# Upper bound on the number of test rows.  The earlier loop tested
# `lineIndex > 100` before incrementing, so it admitted 101 rows.
MAX_TEST_ROWS = 101

# Lower bound applied to the standard deviations.  A column that was
# constant during training has std == 0, which would make the Gaussian
# density divide by zero and yield nan.
MIN_STD = 1e-6


def _gaussian_density(values, mean, std):
    """Return exp(-(values-mean)^2 / (2 std^2)) / sqrt(2 pi std^2), element-wise."""
    variance = np.square(np.maximum(std, MIN_STD))
    return np.exp(-np.square(values - mean) / (2.0 * variance)) / np.sqrt(2.0 * np.pi * variance)


testVector=[]           # test rows, each a list of string fields
# `with` closes the test file, which was previously left open.
with open(TEST_PATH, 'r') as tst:
    stripped_lines = (raw.strip() for raw in tst)
    non_blank_lines = (text for text in stripped_lines if text)   # ignore blank lines
    for line in itertools.islice(non_blank_lines, MAX_TEST_ROWS):
        testVector.append(line.split(','))

lineIndex = len(testVector)           # number of test rows actually read

# Per-row class scores, filled in by the classification step.
probN=np.ones((len(testVector)))
probA=np.ones((len(testVector)))

# Numeric columns (those listed in coln) of the test rows as a float matrix.
arrT=np.zeros((len(testVector),len(coln)))
for j, rows in enumerate(testVector):
    for i, count in enumerate(coln):
        arrT[j, i] = float(rows[count])

resultClass=[]

# Likelihood of every numeric test value under each class's per-column
# Gaussian; shape (number of test rows, number of numeric columns).  The
# per-column mean/std vectors broadcast across the rows.
probVectorN = _gaussian_density(arrT, muN, stdN)
probVectorA = _gaussian_density(arrT, muA, stdA)
              
# Score every test row under both classes and record the more likely one.

# Probability used for anything never seen in training (or whose density is
# exactly zero): one occurrence in the whole training set.  Written with 1.0
# so that Python 2 does not truncate the quotient to 0.
smoothing = 1.0 / len(data)

for testL, tv in enumerate(testVector):
    # Each list collects the factors of one class score, starting with the
    # class prior.
    factorsN = [probNE]
    factorsA = [probAE]

    for field in tv[:-1]:            # the last field is the true label
        if is_number(field):
            continue
        # Both class likelihoods are updated for every categorical field.
        # The row's own label must not decide which score is touched: it
        # is the very thing being predicted.
        keyN = (field, 'normal.')
        keyA = (field, 'anomaly.')
        factorsN.append(jointProbClass[keyN] if keyN in jointCount else smoothing)
        factorsA.append(jointProbClass[keyA] if keyA in jointCount else smoothing)

    # Numeric fields: a density of exactly zero would wipe out the whole
    # product, so it is replaced by the smoothing value.  Each class's
    # densities feed only that class's score.
    factorsN.extend(smoothing if p == 0 else p for p in probVectorN[testL])
    factorsA.extend(smoothing if p == 0 else p for p in probVectorA[testL])

    # Sum logarithms instead of multiplying: a long product of small
    # factors can underflow to 0.0 for both classes, which would make the
    # comparison below meaningless.
    with np.errstate(all='ignore'):
        scoreN = np.sum(np.log(np.array(factorsN, dtype=float)))
        scoreA = np.sum(np.log(np.array(factorsA, dtype=float)))
        # Keep the plain probabilities available in probN / probA.
        probN[testL] = np.exp(scoreN)
        probA[testL] = np.exp(scoreA)

    # Predict the class for the given instance (ties go to 'anomaly.').
    if scoreN > scoreA:
        resultClass.append('normal.')
    else:
        resultClass.append('anomaly.')
        
 
"Estimate the accuracy of the prediction"
# Estimate the accuracy of the prediction.
# A prediction counts as correct when it equals the true label, or when the
# true label is anything other than 'normal.' and the prediction is
# 'anomaly.'.
tCorrect=0.0
for rows, predicted in zip(testVector, resultClass):
    actual = rows[-1]
    if actual == predicted or (actual != 'normal.' and predicted == 'anomaly.'):
        tCorrect+=1

# With no test rows the accuracy is undefined; report nan instead of
# failing with ZeroDivisionError.
if testVector:
    estimation = tCorrect/len(testVector)
else:
    estimation = float('nan')

# %-formatting inside print(...) produces the same output on Python 2 and 3.
print("the estimation of the Bayesian Test is: %s" % estimation)
error= 1-estimation
print("the error rate of the Bayesian Test is: %s" % error)
